當我們的服務上線後,最怕的就是「不知道系統發生什麼事」。
想像一下這個情境:
這就是為什麼我們需要可觀測性 (Observability) 的三大支柱:指標 (Metrics)、日誌 (Logs) 與追蹤 (Traces)。
這是目前最受歡迎的開源監控解決方案:
Prometheus:
Grafana:
Elasticsearch + Logstash + Kibana:
# Scaffold the monitoring role: one sub-directory per role component
# (the {a,b,...} list relies on shell brace expansion).
mkdir -p roles/monitoring/{tasks,templates,defaults,handlers}
---
# roles/monitoring/defaults/main.yml
# Default variables for the monitoring role. Override them from inventory,
# play vars or a vault file.

# Component versions, quoted so they stay strings.
prometheus_version: "3.6.0"
grafana_version: "12.2.0"

# Prometheus settings
prometheus_port: 9090
prometheus_data_dir: /opt/prometheus/data
prometheus_config_dir: /etc/prometheus

# Grafana settings
grafana_port: 3000
grafana_data_dir: /var/lib/grafana
# WARNING: placeholder value only; override it with a vaulted secret.
grafana_admin_password: "admin123"

# Scrape targets
monitoring_targets:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
  - job_name: 'web-servers'
    static_configs:
      # Builds "<default IPv4>:9100" for every host in the 'web' group.
      # - default([]) keeps the expression valid when the group is absent.
      # - The port is appended by replacing the end-of-string anchor, which
      #   avoids a regex back-reference and its quoting/escaping pitfalls.
      # NOTE(review): every 'web' host must already have facts gathered,
      # otherwise ansible_default_ipv4 is undefined for it.
      - targets: >-
          {{ groups['web'] | default([])
          | map('extract', hostvars, 'ansible_default_ipv4')
          | map(attribute='address')
          | map('regex_replace', '$', ':9100')
          | list }}
---
# roles/monitoring/tasks/main.yml
# Installs Prometheus from the upstream release tarball, deploys its
# configuration and systemd unit, includes the Node Exporter and Grafana
# task files, then starts and enables all three services.

- name: Create monitoring users and directories
  block:
    - name: Create prometheus user
      user:
        name: prometheus
        system: true
        shell: /bin/false
        home: "{{ prometheus_data_dir }}"
        create_home: false

    - name: Create prometheus directories
      file:
        path: "{{ item }}"
        state: directory
        owner: prometheus
        group: prometheus
        mode: '0755'
      loop:
        - "{{ prometheus_config_dir }}"
        - "{{ prometheus_data_dir }}"
        - /opt/prometheus

- name: Download and install Prometheus
  unarchive:
    src: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz"
    dest: /opt
    remote_src: true
    owner: prometheus
    group: prometheus
    # Skip the download when this version is already unpacked.
    creates: "/opt/prometheus-{{ prometheus_version }}.linux-amd64"
  register: prometheus_download

# Deliberately unconditional: the file module is idempotent, so this also
# repairs a missing or stale link on runs where nothing was downloaded.
# A changed link means a different binary, hence the restart.
- name: Create prometheus symlink
  file:
    src: "/opt/prometheus-{{ prometheus_version }}.linux-amd64"
    dest: /opt/prometheus/current
    state: link
    owner: prometheus
    group: prometheus
  notify: restart prometheus

- name: Deploy prometheus configuration
  template:
    src: prometheus.yml.j2
    dest: "{{ prometheus_config_dir }}/prometheus.yml"
    owner: prometheus
    group: prometheus
    mode: '0644'
  notify: restart prometheus

- name: Deploy prometheus systemd service
  template:
    src: prometheus.service.j2
    dest: /etc/systemd/system/prometheus.service
    mode: '0644'
  notify:
    - reload systemd
    - restart prometheus

- name: Install Node Exporter
  include_tasks: node_exporter.yml

- name: Install and configure Grafana
  include_tasks: grafana.yml

# daemon_reload makes freshly written unit files visible before the start,
# because the notified handlers have not run yet at this point.
- name: Start and enable monitoring services
  systemd:
    name: "{{ item }}"
    state: started
    enabled: true
    daemon_reload: true
  loop:
    - prometheus
    - node-exporter
    - grafana-server
---
# roles/monitoring/tasks/node_exporter.yml
# Installs Node Exporter from the upstream release tarball and runs it as a
# systemd service. This file is also pulled in on its own through
# include_role/tasks_from, so it creates the service account and starts the
# service itself instead of relying on tasks/main.yml.

- name: Install Node Exporter from the upstream release
  vars:
    # Override node_exporter_version to pin another release.
    node_exporter_release: "node_exporter-{{ node_exporter_version | default('1.9.1') }}.linux-amd64"
  block:
    # 'home' is intentionally not set, so an existing account is left as is.
    - name: Ensure prometheus user exists
      user:
        name: prometheus
        system: true
        shell: /bin/false
        create_home: false

    - name: Download Node Exporter
      unarchive:
        src: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version | default('1.9.1') }}/{{ node_exporter_release }}.tar.gz"
        dest: /opt
        remote_src: true
        owner: prometheus
        group: prometheus
        # Skip the download when this version is already unpacked.
        creates: "/opt/{{ node_exporter_release }}"

    - name: Create node-exporter symlink
      file:
        src: "/opt/{{ node_exporter_release }}"
        dest: /opt/node-exporter
        state: link

    - name: Deploy node-exporter systemd service
      template:
        src: node-exporter.service.j2
        dest: /etc/systemd/system/node-exporter.service
        mode: '0644'
      notify: reload systemd

    # daemon_reload picks up the unit written above before starting it.
    - name: Start and enable node-exporter
      systemd:
        name: node-exporter
        state: started
        enabled: true
        daemon_reload: true
---
# roles/monitoring/tasks/grafana.yml
# Installs Grafana from the vendor APT repository, deploys grafana.ini and
# creates a placeholder dashboard through the Grafana HTTP API.

- name: Add Grafana repository key
  apt_key:
    url: https://packages.grafana.com/gpg.key
    state: present
  when: ansible_os_family == "Debian"

- name: Add Grafana repository
  apt_repository:
    repo: "deb https://packages.grafana.com/oss/deb stable main"
    state: present
  when: ansible_os_family == "Debian"

- name: Install Grafana
  package:
    name: grafana
    state: present

- name: Configure Grafana
  template:
    src: grafana.ini.j2
    dest: /etc/grafana/grafana.ini
    backup: true
  notify: restart grafana

# The import below talks to the HTTP API, so the service has to be running
# before it is attempted.
- name: Ensure Grafana is running before using its API
  systemd:
    name: grafana-server
    state: started
    enabled: true
    daemon_reload: true

- name: Import Grafana dashboards
  uri:
    url: "http://localhost:{{ grafana_port }}/api/dashboards/db"
    method: POST
    headers:
      Content-Type: "application/json"
    # Basic auth as the 'admin' user; force_basic_auth sends the credentials
    # on the first request.
    url_username: admin
    url_password: "{{ grafana_admin_password }}"
    force_basic_auth: true
    body_format: json
    body:
      dashboard:
        id: null
        title: "Node Exporter Full"
        tags: ["prometheus", "node-exporter"]
        timezone: "browser"
        panels: []
        time:
          from: "now-6h"
          to: "now"
        refresh: "30s"
      folderId: 0
      overwrite: true
  register: grafana_dashboard_import
  # An explicit condition makes the retry loop wait until the API answers.
  until: grafana_dashboard_import is succeeded
  retries: 5
  delay: 10
Prometheus 配置模板:
# roles/monitoring/templates/prometheus.yml.j2
# Prometheus server configuration rendered by Ansible from monitoring_targets.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - "/etc/prometheus/rules/*.yml"

# The loop tags below are kept at column 0: spaces in front of a tag would be
# copied into the output and shift the indentation of the next YAML line.
# Target lists go through to_json so they always render as a valid flow list.
scrape_configs:
{% for target in monitoring_targets %}
  - job_name: '{{ target.job_name }}'
    static_configs:
{% for config in target.static_configs %}
      - targets: {{ config.targets | to_json }}
{% endfor %}
{% endfor %}

  # Auto-discover Docker containers
  - job_name: 'docker'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      - source_labels: [__meta_docker_container_name]
        target_label: container_name
系統服務模板:
# roles/monitoring/templates/prometheus.service.j2
# systemd unit for the Prometheus server; the Jinja placeholders are filled in
# by Ansible when the template is rendered.
[Unit]
Description=Prometheus
# Order the service after the network-online target.
Wants=network-online.target
After=network-online.target
[Service]
# Run as the prometheus user and group.
User=prometheus
Group=prometheus
Type=simple
# Start the binary through the /opt/prometheus/current path, passing the
# config file, TSDB storage path, console paths, listen address and external URL.
ExecStart=/opt/prometheus/current/prometheus \
--config.file={{ prometheus_config_dir }}/prometheus.yml \
--storage.tsdb.path={{ prometheus_data_dir }} \
--web.console.templates=/opt/prometheus/current/consoles \
--web.console.libraries=/opt/prometheus/current/console_libraries \
--web.listen-address=0.0.0.0:{{ prometheus_port }} \
--web.external-url=http://{{ ansible_default_ipv4.address }}:{{ prometheus_port }}
[Install]
WantedBy=multi-user.target
---
# roles/monitoring/handlers/main.yml
# Handlers notified by the monitoring role's tasks. 'reload systemd' is listed
# first so that it runs before any restart notified in the same play.

- name: reload systemd
  systemd:
    daemon_reload: true

- name: restart prometheus
  systemd:
    name: prometheus
    state: restarted

- name: restart grafana
  systemd:
    name: grafana-server
    state: restarted

# The "Configure AlertManager" task in tasks/alertmanager.yml notifies this
# name; without a matching handler that notification cannot be resolved.
# NOTE(review): assumes the systemd unit is called 'alertmanager' - confirm.
- name: restart alertmanager
  systemd:
    name: alertmanager
    state: restarted
---
# roles/elk/tasks/main.yml
# Sets up the ELK stack: Java, the Elastic APT repository, Elasticsearch,
# then Kibana and Logstash via their own task files, and finally the services.

- name: Install Java (required for Elasticsearch and Logstash)
  package:
    state: present
    name: openjdk-11-jdk

- name: Add Elastic repository
  block:
    - name: Add Elasticsearch signing key
      apt_key:
        state: present
        url: https://artifacts.elastic.co/GPG-KEY-elasticsearch

    - name: Add Elastic repository
      apt_repository:
        state: present
        repo: "deb https://artifacts.elastic.co/packages/8.x/apt stable main"

- name: Install Elasticsearch
  notify: restart elasticsearch
  package:
    state: present
    name: elasticsearch

- name: Configure Elasticsearch
  notify: restart elasticsearch
  template:
    src: elasticsearch.yml.j2
    dest: /etc/elasticsearch/elasticsearch.yml
    backup: true

- name: Install and configure Kibana
  include_tasks: kibana.yml

- name: Install and configure Logstash
  include_tasks: logstash.yml

- name: Start and enable ELK services
  loop:
    - elasticsearch
    - kibana
    - logstash
  systemd:
    name: "{{ item }}"
    daemon_reload: true
    enabled: true
    state: started
# roles/elk/templates/logstash.conf.j2
# Logstash pipeline: receives Beats traffic and tails two local log files,
# parses them by type, and writes the events to Elasticsearch and stdout.
input {
  beats {
    port => 5044
  }

  # Collect the system log
  file {
    path => "/var/log/syslog"
    type => "syslog"
    start_position => "beginning"
  }

  # Collect the Nginx access log
  file {
    path => "/var/log/nginx/access.log"
    type => "nginx_access"
    start_position => "beginning"
  }
}

filter {
  if [type] == "nginx_access" {
    # NOTE(review): NGINXACCESS is not defined anywhere in this file; confirm
    # that a custom grok pattern with this name is installed.
    grok {
      match => {
        "message" => "%{NGINXACCESS}"
      }
    }
    # Use the request time from the log line as the event timestamp.
    date {
      match => [ "timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
    }
    mutate {
      convert => {
        "response_code" => "integer"
        "bytes" => "integer"
      }
    }
  }

  if [type] == "syslog" {
    grok {
      match => {
        "message" => "%{SYSLOGTIMESTAMP:timestamp} %{IPORHOST:host} %{PROG:program}: %{GREEDYDATA:message}"
      }
      # The pattern captures into the already existing "message" field;
      # overwrite makes the parsed text replace the raw line instead of
      # being added next to it.
      overwrite => ["message"]
    }
  }
}

output {
  # NOTE(review): events without a "type" field (for example from the beats
  # input) leave the placeholder in the index name unresolved - confirm.
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "%{type}-%{+YYYY.MM.dd}"
  }

  # Debug output
  stdout {
    codec => rubydebug
  }
}
---
# roles/monitoring/tasks/alertmanager.yml
# Installs AlertManager from the upstream release tarball and deploys its
# configuration together with the Prometheus alert rules.

- name: Download and install AlertManager
  vars:
    # Override alertmanager_version to pin another release.
    alertmanager_release: "alertmanager-{{ alertmanager_version | default('0.28.1') }}.linux-amd64"
  unarchive:
    src: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version | default('0.28.1') }}/{{ alertmanager_release }}.tar.gz"
    dest: /opt
    remote_src: true
    owner: prometheus
    group: prometheus
    # Skip the download when this version is already unpacked.
    creates: "/opt/{{ alertmanager_release }}"

- name: Configure AlertManager
  template:
    src: alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
    owner: prometheus
    group: prometheus
    mode: '0644'
  notify: restart alertmanager

# The template module does not create parent directories, so the rules
# directory has to exist before the rule file is written into it.
- name: Create alert rules directory
  file:
    path: /etc/prometheus/rules
    state: directory
    owner: prometheus
    group: prometheus
    mode: '0755'

- name: Deploy alert rules
  template:
    src: alert_rules.yml.j2
    dest: /etc/prometheus/rules/alert_rules.yml
    owner: prometheus
    group: prometheus
    mode: '0644'
  notify: restart prometheus
# roles/monitoring/templates/alert_rules.yml.j2
{% raw %}
# Prometheus alerting rules. The whole file sits inside a Jinja raw block so
# that the label placeholders in the annotations reach Prometheus untouched
# instead of being parsed (and rejected) by the Ansible template engine.
groups:
  - name: system_alerts
    rules:
      # CPU usage too high
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}"

      # Memory usage too high
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% on {{ $labels.instance }}"

      # Disk space running low
      - alert: DiskSpaceLow
        expr: node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"} * 100 < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Disk space running low"
          description: "Disk space is below 10% on {{ $labels.instance }} filesystem {{ $labels.mountpoint }}"

      # Scrape target is down
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} is down"
{% endraw %}
# roles/monitoring/templates/alertmanager.yml.j2
# AlertManager configuration: routes every alert to one Slack receiver.
# Ansible fills in the webhook and Grafana address; the notification
# title/text expressions are wrapped in raw markers so they are passed
# through to AlertManager unchanged.
global:
  slack_api_url: '{{ slack_webhook_url }}'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'slack-notifications'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        username: 'Prometheus'
        icon_emoji: ':fire:'
        title: '{% raw %}Alert: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}{% endraw %}'
        text: '{% raw %}{{ range .Alerts }}{{ .Annotations.description }}{{ end }}{% endraw %}'
        actions:
          - type: button
            text: 'View in Grafana'
            # Uses grafana_url when it is set, otherwise this host's default
            # IPv4 address and the configured Grafana port.
            url: 'http://{{ grafana_url if grafana_url is defined else ansible_default_ipv4.address ~ ':' ~ grafana_port }}/d/node-exporter-full'
---
# monitoring-deploy.yml
# Deploys the monitoring stack (Prometheus, Grafana, ELK) to the monitoring
# hosts, then installs the agents on the application hosts.

# The monitoring role's default scrape targets read ansible_default_ipv4 from
# every host in the 'web' group, so those facts must exist before the
# Prometheus configuration is rendered in the next play.
- name: Gather facts from hosts that will be scraped
  hosts: web
  gather_facts: true
  tasks: []

- name: Deploy complete monitoring stack
  hosts: monitoring
  become: true
  vars:
    slack_webhook_url: "{{ vault_slack_webhook }}"
    grafana_admin_password: "{{ vault_grafana_password }}"
  roles:
    - monitoring
    - elk
  post_tasks:
    # Each service exposes its health check on a different path.
    - name: Wait for services to be ready
      uri:
        url: "http://{{ ansible_default_ipv4.address }}:{{ item.port }}{{ item.path }}"
        method: GET
        status_code: 200
      register: service_health
      # An explicit condition makes the retry loop wait for a healthy answer.
      until: service_health is succeeded
      retries: 30
      delay: 10
      loop:
        - { port: 9090, path: "/-/healthy" }   # Prometheus
        - { port: 3000, path: "/api/health" }  # Grafana
      loop_control:
        label: "{{ item.port }}{{ item.path }}"

    # The admin password is deliberately not printed: debug output ends up
    # in terminal scrollback and CI logs.
    - name: Display access URLs
      debug:
        msg:
          - "Prometheus: http://{{ ansible_default_ipv4.address }}:9090"
          - "Grafana: http://{{ ansible_default_ipv4.address }}:3000 (user: admin, password: stored in vault)"
          - "Kibana: http://{{ ansible_default_ipv4.address }}:5601"

# Deploy Node Exporter and log shipping to the application servers
- name: Deploy monitoring agents
  hosts: web,db
  become: true
  tasks:
    - name: Install Node Exporter
      include_role:
        name: monitoring
        tasks_from: node_exporter.yml

    - name: Configure Filebeat for log shipping
      include_role:
        name: elk
        tasks_from: filebeat.yml
重要指標包括:
# CPU usage (%)
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage (%)
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
# Disk I/O (operations per second)
rate(node_disk_reads_completed_total[5m])
rate(node_disk_writes_completed_total[5m])
# Network traffic (bytes per second, loopback excluded)
rate(node_network_receive_bytes_total{device!="lo"}[5m])
rate(node_network_transmit_bytes_total{device!="lo"}[5m])
# HTTP request rate (requires the application to expose metrics)
rate(http_requests_total[5m])
# HTTP error rate (%)
# Both sides are summed first: dividing the raw vectors would only pair each
# 5xx series with itself (identical label sets) and always yield 100.
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100
明天我們來看看容器與雲端整合,學習如何用 Ansible 管理 Docker 和雲端資源!